Creating an Automated Feedback Pipeline with LangSmith
Manually analyzing text processed by your language model is useful, but it doesn't scale. Automated metrics offer a solution: by adding them to your LangSmith projects, you can track advanced metrics on your LLM's performance and user inputs directly from the dashboard.

[Figure: model-based feedback monitoring charts]

If the metrics reveal issues, you can isolate problematic runs for debugging or fine-tuning. This tutorial shows you how to set up an automated feedback pipeline for your language models.

Steps:

1. Filter Runs: First, identify the runs you want to evaluate. For details, refer to the Run Filtering Documentation.
2. Define Feedback Logic: Create a chain or function to calculate the feedback metrics.
3. Send Feedback to LangSmith: Use the client.create_feedback method to send metrics. Alternatively, use client.evaluate_run, which both evaluates and logs metrics for you (a minimal sketch follows the list).

We'll be using LangSmith and the hub APIs, so make sure you have the necessary API keys.
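Here is what the create_feedback call from step 3 looks like in isolation. The run ID, metric name, and score below are placeholders for illustration only; in the full pipeline, the run ID comes from client.list_runs and the score from your feedback logic.

from langsmith import Client

client = Client()

# Placeholder values for illustration only.
client.create_feedback(
    "<RUN-ID>",  # ID of the run to annotate
    key="smog_index",  # Metric name as it will appear in the dashboard
    score=9.5,  # Numeric score used in the monitoring charts
    feedback_source_type="model",  # Marks this as automated (model-based) feedback
)

Now configure your environment and API keys: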
import os
from uuid import uuid4

unique_id = uuid4().hex[0:8]

# The project the agent traces (and the example runs created below) are logged to,
# and that the feedback pipeline will read from.
project_name = f"Tracing Walkthrough - {unique_id}"  # Change to your project name

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = project_name
# Update with your API URL if using a hosted instance of LangSmith.
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# Update with your API key
os.environ["LANGCHAIN_API_KEY"] = "<YOUR-API-KEY>"
# Update with your Hub API URL if using a hosted instance of LangSmith.
os.environ["LANGCHAIN_HUB_API_URL"] = "https://api.hub.langchain.com"
# Update with your Hub API key
os.environ["LANGCHAIN_HUB_API_KEY"] = "<YOUR-HUB-API-KEY>"
# Used by the agent in this tutorial
os.environ["OPENAI_API_KEY"] = "<YOUR-OPENAI-API-KEY>"
from langchain import hub
from langchain.agents import AgentExecutor
from langchain.agents.format_scratchpad.openai_tools import (
format_to_openai_tool_messages,
)
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser
from langchain_community.tools import DuckDuckGoSearchResults
from langchain_openai import ChatOpenAI
# Pull a pinned version of the agent prompt from the LangChain Hub
prompt = hub.pull("wfh/langsmith-agent-prompt:5d466cbc")
llm = ChatOpenAI(
model="gpt-3.5-turbo-16k",
temperature=0,
)
tools = [
DuckDuckGoSearchResults(
name="duck_duck_go"
), # General internet search using DuckDuckGo
]
llm_with_tools = llm.bind_tools(tools)
runnable_agent = (
{
"input": lambda x: x["input"],
"agent_scratchpad": lambda x: format_to_openai_tool_messages(
x["intermediate_steps"]
),
}
| prompt
| llm_with_tools
| OpenAIToolsAgentOutputParser()
)
agent_executor = AgentExecutor(
agent=runnable_agent, tools=tools, handle_parsing_errors=True
)
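The agent above is simply an example application that generates traces to evaluate. Below is a minimal sketch of invoking it on a few illustrative questions (any inputs your agent can handle will do); since tracing is enabled, each call is logged to the project configured earlier. Alternatively, you can skip this and rely on the example runs created in the next step.

sample_questions = [
    "What is LangSmith?",
    "When was Llama-v2 released?",
]

# Each invocation is traced to the configured project.
results = agent_executor.batch(
    [{"input": q} for q in sample_questions],
    return_exceptions=True,  # Don't let one failing run stop the batch
)

Next, create a LangSmith client and seed the project with a few example question/answer runs so the feedback pipeline has predictable data to score: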
from langsmith import Client
from datetime import datetime
client = Client()
example_data = [
("Who trained Llama-v2?", "I'm sorry, but I don't have that information."),
(
"When did langchain first announce the hub?",
"LangChain first announced the LangChain Hub on September 5, 2023.",
),
(
"What's LangSmith?",
"LangSmith is a platform developed by LangChain for building production-grade LLM (Language Model) applications. It allows you to debug, test, evaluate, and monitor chains and intelligent agents built on any LLM framework. LangSmith seamlessly integrates with LangChain's open-source framework called LangChain, which is widely used for building applications with LLMs.\n\nLangSmith provides full visibility into model inputs and outputs at every step in the chain of events, making it easier to debug and analyze the behavior of LLM applications. It has been tested with early design partners and on internal workflows, and it has been found to help teams in various ways.\n\nYou can find more information about LangSmith on the official LangSmith documentation [here](https://docs.smith.langchain.com/). Additionally, you can read about the announcement of LangSmith as a unified platform for debugging and testing LLM applications [here](https://blog.langchain.dev/announcing-langsmith/).",
),
(
"What is the langsmith cookbook?",
"I'm sorry, but I couldn't find any information about the \"Langsmith Cookbook\". It's possible that it may not be a well-known cookbook or it may not exist. Could you provide more context or clarify the name?",
),
(
"What is LangChain?",
"I'm sorry, but I couldn't find any information about \"LangChain\". Could you please provide more context or clarify your question?",
),
("When was Llama-v2 released?", "Llama-v2 was released on July 18, 2023."),
]
for input_, output_ in example_data:
client.create_run(
name="ExampleRun",
run_type="chain",
inputs={"input": input_},
outputs={"output": output_},
project_name=project_name,
end_time=datetime.utcnow(),
)
# Only fetch today's top-level (root) runs that completed without errors.
midnight = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)

runs = list(
    client.list_runs(
        project_name=project_name, execution_order=1, start_time=midnight, error=False
    )
)
import textstat
from langsmith.schemas import Run, Example
from langchain.schema.runnable import RunnableLambda
def compute_stats(run: Run) -> None:
# Note: your chain's runs may have different keys.
# Be sure to select the right field(s) to measure!
if "input" not in run.inputs:
return
if run.feedback_stats and "smog_index" in run.feedback_stats:
# If we are running this pipeline multiple times
return
text = run.inputs["input"]
try:
fns = [
"flesch_reading_ease",
"flesch_kincaid_grade",
"smog_index",
"coleman_liau_index",
"automated_readability_index",
]
metrics = {fn: getattr(textstat, fn)(text) for fn in fns}
for key, value in metrics.items():
client.create_feedback(
run.id,
key=key,
score=value, # The numeric score is used in the monitoring charts
feedback_source_type="model",
)
    except Exception:
        # If a readability metric fails on this input, skip the run rather than halting the pipeline.
        pass
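With the scoring function defined, apply it to the runs selected above. A plain loop is a minimal sketch; for larger projects you could wrap the function in a RunnableLambda and call .batch(), as is done for the model-based evaluator below.

for run in runs:
    compute_stats(run)

Readability statistics only look at the inputs. To grade each question along richer dimensions (relevance, difficulty, verbosity, specificity), define a model-based evaluator that asks an LLM to submit scores via function calling: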
from langchain import hub
from langchain_openai import ChatOpenAI
from langchain.callbacks import collect_runs
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser

# Fetch the grading prompt before building the chain that uses it
prompt = hub.pull(
    "wfh/automated-feedback-example", api_url="https://api.hub.langchain.com"
)
chain = (
prompt
| ChatOpenAI(model="gpt-3.5-turbo", temperature=1).bind(
functions=[
{
"name": "submit_scores",
"description": "Submit the graded scores for a user question and bot response.",
"parameters": {
"type": "object",
"properties": {
"relevance": {
"type": "integer",
"minimum": 0,
"maximum": 5,
"description": "Score indicating the relevance of the question to LangChain/LangSmith.",
},
"difficulty": {
"type": "integer",
"minimum": 0,
"maximum": 5,
"description": "Score indicating the complexity or difficulty of the question.",
},
"verbosity": {
"type": "integer",
"minimum": 0,
"maximum": 5,
"description": "Score indicating how verbose the question is.",
},
"specificity": {
"type": "integer",
"minimum": 0,
"maximum": 5,
"description": "Score indicating how specific the question is.",
},
},
"required": ["relevance", "difficulty", "verbosity", "specificity"],
},
}
]
)
| JsonOutputFunctionsParser()
)
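For intuition, you can invoke the grading chain directly on a single question/prediction pair. The inputs below are illustrative and assume the hub prompt expects question and prediction variables, as used in evaluate_run further down; the parser returns a dict of integer scores.

scores = chain.invoke(
    {
        "question": "What is LangSmith?",
        "prediction": "LangSmith is a platform for debugging and monitoring LLM applications.",
    }
)
# e.g. {"relevance": 5, "difficulty": 1, "verbosity": 1, "specificity": 2}

The evaluate_run function below applies this chain to a stored run and logs each score as feedback: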
def evaluate_run(run: Run) -> None:
try:
# Note: your chain's runs may have different keys.
# Be sure to select the right field(s) to measure!
if "input" not in run.inputs or not run.outputs or "output" not in run.outputs:
return
if run.feedback_stats and "specificity" in run.feedback_stats:
# We have already scored this run
# (if you're running this pipeline multiple times)
return
with collect_runs() as cb:
result = chain.invoke(
{
"question": run.inputs["input"][:3000], # lazy truncation
"prediction": run.outputs["output"][:3000],
},
)
for feedback_key, value in result.items():
score = int(value) / 5
client.create_feedback(
run.id,
key=feedback_key,
score=score,
source_run_id=cb.traced_runs[0].id,
feedback_source_type="model",
)
    except Exception:
        # Skip runs that fail to grade rather than halting the pipeline.
        pass


wrapped_function = RunnableLambda(evaluate_run)
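Wrapping the function in a RunnableLambda lets you grade the selected runs concurrently (and trace the grading itself). A sketch of applying it, assuming the runs list selected earlier:

# Grade each run; exceptions are returned instead of raised so one bad run doesn't stop the batch.
_ = wrapped_function.batch(runs, {"max_concurrency": 10}, return_exceptions=True)

You can also package feedback logic as a RunEvaluator and let client.evaluate_run handle both evaluation and logging. The completeness evaluator below wraps LangChain's criteria evaluator: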
from typing import Optional
from langchain import evaluation, callbacks
from langsmith import evaluation as ls_evaluation
class CompletenessEvaluator(ls_evaluation.RunEvaluator):
def __init__(self):
criteria_description = (
"Does the answer provide sufficient and complete information"
"to fully address all aspects of the question (Y)?"
" Or does it lack important details (N)?"
)
self.evaluator = evaluation.load_evaluator(
"criteria", criteria={"completeness": criteria_description}
)
def evaluate_run(
self, run: Run, example: Optional[Example] = None
) -> ls_evaluation.EvaluationResult:
if (
not run.inputs
or not run.inputs.get("input")
or not run.outputs
or not run.outputs.get("output")
):
return ls_evaluation.EvaluationResult(key="completeness", score=None)
question = run.inputs["input"]
prediction = run.outputs["output"]
with callbacks.collect_runs() as cb:
result = self.evaluator.evaluate_strings(
input=question, prediction=prediction
)
run_id = cb.traced_runs[0].id
return ls_evaluation.EvaluationResult(
key="completeness", evaluator_info={"__run": {"run_id": run_id}}, **result
)
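A sketch of applying the evaluator to the selected runs; client.evaluate_run runs the evaluator and logs the resulting feedback in one call.

evaluator = CompletenessEvaluator()
for run in runs:
    client.evaluate_run(run, evaluator)

The same pattern works with off-the-shelf evaluators. The helpfulness evaluator below uses LangChain's score_string evaluator and normalizes its 1-10 score into the 0-1 range: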
from typing import Optional
from langchain.evaluation import load_evaluator
from langsmith.evaluation import RunEvaluator, EvaluationResult
from langsmith.schemas import Run, Example
class HelpfulnessEvaluator(RunEvaluator):
def __init__(self):
self.evaluator = load_evaluator(
"score_string", criteria="helpfulness", normalize_by=10
)
def evaluate_run(
self, run: Run, example: Optional[Example] = None
) -> EvaluationResult:
if (
not run.inputs
or not run.inputs.get("input")
or not run.outputs
or not run.outputs.get("output")
):
return EvaluationResult(key="helpfulness", score=None)
result = self.evaluator.evaluate_strings(
input=run.inputs["input"], prediction=run.outputs["output"]
)
return EvaluationResult(
**{"key": "helpfulness", "comment": result.get("reasoning"), **result}
)
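Apply it the same way (a sketch, assuming the same runs list):

evaluator = HelpfulnessEvaluator()
for run in runs:
    client.evaluate_run(run, evaluator)

All of the feedback logged above appears in the project's monitoring charts, where you can filter for low-scoring runs to debug or collect for fine-tuning.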